{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Importando bibliotecas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:1212: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
      "  warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
     ]
    }
   ],
   "source": [
    "import datetime\n",
    "import json\n",
    "\n",
    "import bokeh.io\n",
    "import gensim\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bokeh.core.properties import Instance, String\n",
    "from bokeh.io import output_notebook\n",
    "from bokeh.models import ColumnDataSource, LabelSet, LayoutDOM, Plot\n",
    "from bokeh.plotting import figure, show, output_file\n",
    "from bokeh.resources import INLINE\n",
    "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n",
    "from gensim.test.utils import common_texts, get_tmpfile\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn import svm\n",
    "from sklearn import tree\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "# Public import path; the private module sklearn.neighbors.nearest_centroid is not importable\n",
    "# in newer scikit-learn releases.\n",
    "from sklearn.neighbors import NearestCentroid\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.neural_network import MLPClassifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Importando a base concatenada e misturando os artigos para evitar que haja um viés devido à ordem de leitura dos documentos."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the consolidated abstracts and shuffle the rows so the order in which the documents\n",
    "# were read cannot bias later steps. The fixed seed keeps the shuffle identical on every\n",
    "# Restart & Run All; reset_index(drop=True) renumbers the shuffled rows from 0.\n",
    "abstract = (\n",
    "    pd.read_json(\"Elsevier_abstract - Consolidado.json\")\n",
    "    .sample(frac=1, random_state=42)\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pré-processamento"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Retirando palavras e espaços"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the boilerplate headings \"Abstract\", \"Unknown\", \"Publisher Summary\", \"Summary\" and\n",
    "# \"Fundamento\" that open many of the texts.\n",
    "# The pattern is anchored at the start of each text, so the same words appearing later in the\n",
    "# body are left untouched. \"Publisher Summary\" is listed before \"Summary\" so the longer heading\n",
    "# is matched first, and the outer group is repeated so stacked headings are removed together.\n",
    "# regex=True is passed explicitly because the default of str.replace differs between pandas versions.\n",
    "abstract[\"Abstract\"] = abstract[\"Abstract\"].str.replace(\n",
    "    r\"^\\s*(?:(?:Abstract|Unknown|Publisher Summary|Summary|Fundamento)\\s*)+\",\n",
    "    \"\",\n",
    "    regex=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse every run of whitespace (spaces, tabs, newlines) into one space:\n",
    "# split each text on whitespace, then join the resulting words with single spaces.\n",
    "abstract[\"Abstract\"] = abstract[\"Abstract\"].str.split().str.join(\" \")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Eliminando as linhas sem Abstract"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows that actually have an abstract: missing values (None/NaN) are discarded\n",
    "# together with empty strings, so every remaining row holds text for the tokenisation step.\n",
    "has_text = abstract[\"Abstract\"].notna() & (abstract[\"Abstract\"] != \"\")\n",
    "# reset_index(drop=True) renumbers the rows from 0 without keeping the old index as a column.\n",
    "abstract = abstract[has_text].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Realizando o pré-processamento simples do Gensim (`gensim.utils.simple_preprocess`), basicamente para gerar os tokens dos textos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenise every abstract with gensim's simple_preprocess; each cell becomes a list of tokens.\n",
    "abstract[\"Abstract\"] = abstract[\"Abstract\"].map(gensim.utils.simple_preprocess)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gravando arquivo com os documentos pré-processados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the pre-processed abstracts to disk, then read the same file straight back so the\n",
    "# following cells work on what is stored on disk.\n",
    "preprocessed_file = \"Elsevier_abstract - Preprocessado.json\"\n",
    "abstract.to_json(preprocessed_file)\n",
    "abstract = pd.read_json(preprocessed_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Artigos no qual um dos autores é afiliado a empresas de O&G:  14418\n",
      "Artigos em que nenhum dos autores é afiliado a empresas de O&G:  11227\n"
     ]
    }
   ],
   "source": [
    "# Class balance of the whole corpus, keyed on the \"O&G\" flag (1 = O&G, 0 = not O&G).\n",
    "og_flag = abstract[\"O&G\"]\n",
    "print (\"Artigos no qual um dos autores é afiliado a empresas de O&G: \", int((og_flag == 1).sum()))\n",
    "print (\"Artigos em que nenhum dos autores é afiliado a empresas de O&G: \", int((og_flag == 0).sum()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dividindo os dados usando 90% para treino e 10% para teste."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Conjunto de Treino\n",
      "Artigos no qual um dos autores é afiliado a empresas de O&G:  12962\n",
      "Artigos em que nenhum dos autores é afiliado a empresas de O&G:  10118\n",
      "\n",
      "Conjunto de Teste\n",
      "Artigos no qual um dos autores é afiliado a empresas de O&G:  1456\n",
      "Artigos em que nenhum dos autores é afiliado a empresas de O&G:  1109\n"
     ]
    }
   ],
   "source": [
    "def print_class_counts(title, frame):\n",
    "    \"\"\"Print `title`, then how many rows of `frame` have \"O&G\" == 1 and how many have \"O&G\" == 0.\"\"\"\n",
    "    print (title)\n",
    "    print (\"Artigos no qual um dos autores é afiliado a empresas de O&G: \",\n",
    "           len(frame[frame[\"O&G\"] == 1]))\n",
    "    print (\"Artigos em que nenhum dos autores é afiliado a empresas de O&G: \",\n",
    "           len(frame[frame[\"O&G\"] == 0]))\n",
    "\n",
    "\n",
    "# Split the data into train (90%) and test (10%).\n",
    "# random_state makes the split reproducible across kernel restarts.\n",
    "abstract_train, abstract_test = train_test_split(abstract, test_size=0.1, random_state=42)\n",
    "# Explicit copies turn both subsets into independent frames, so adding columns to them later\n",
    "# does not trigger pandas' SettingWithCopyWarning.\n",
    "abstract_train = abstract_train.copy()\n",
    "abstract_test = abstract_test.copy()\n",
    "\n",
    "print_class_counts(\"Conjunto de Treino\", abstract_train)\n",
    "print (\"\")\n",
    "print_class_counts(\"Conjunto de Teste\", abstract_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
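    "Aumentando os dados de treinamento deixando uma proporção de 2/3 dos artigos de O&G e 1/3 para não O&G. (Etapa atualmente desativada: o código da célula abaixo está comentado e os modelos seguintes são treinados com `abstract_train`.)"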
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Conjunto de Treino Aumentado\n",
      "Artigos no qual um dos autores é afiliado a empresas de O&G:  25876\n",
      "Artigos em que nenhum dos autores é afiliado a empresas de O&G:  30426\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#augmented_abstract_train = abstract_train\n",
    "# Triplicando os artigos de O&G\n",
    "#augmented_abstract_train = augmented_abstract_train.append(abstract_train[abstract_train[\"O&G\"]==1],\n",
    "#                                                           ignore_index=True)\n",
    "#augmented_abstract_train = augmented_abstract_train.append(abstract_train[abstract_train[\"O&G\"]==1],\n",
    "#                                                           ignore_index=True)\n",
    "# Duplicando os artigos de que não são O&G\n",
    "#augmented_abstract_train = augmented_abstract_train.append(abstract_train[abstract_train[\"O&G\"]==0],\n",
    "#                                                           ignore_index=True)\n",
    "# Embaralhando os documentos\n",
    "#augmented_abstract_train = augmented_abstract_train.sample(frac=1).reset_index(drop=True)\n",
    "\n",
    "#print (\"Conjunto de Treino Aumentado\")\n",
    "#print (\"Artigos no qual um dos autores é afiliado a empresas de O&G: \", \n",
    "#       len(augmented_abstract_train[augmented_abstract_train[\"O&G\"] == 1]))\n",
    "#print (\"Artigos em que nenhum dos autores é afiliado a empresas de O&G: \",\n",
    "#       len(augmented_abstract_train[augmented_abstract_train[\"O&G\"] == 0]))\n",
    "#print (\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vetorizando e classificando os documentos"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gerando os modelos Doc2Vec utilizando os artigos provenientes do conjunto de treino"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tempo total decorrido:  0:03:45.266000\n"
     ]
    }
   ],
   "source": [
    "momentoInicial = datetime.datetime.now()   # start of the timed section\n",
    "\n",
    "# Doc2Vec takes TaggedDocument inputs: each training abstract is tagged with its position\n",
    "# in the training set. (To use the augmented set instead, iterate over\n",
    "# augmented_abstract_train[\"Abstract\"] here.)\n",
    "documents = [\n",
    "    TaggedDocument(doc, [i])\n",
    "    for i, doc in enumerate(abstract_train[\"Abstract\"])\n",
    "]\n",
    "\n",
    "# Doc2Vec hyperparameters, collected in one place\n",
    "doc2vec_params = {\n",
    "    \"vector_size\": 50,\n",
    "    \"dm\": 0,\n",
    "    \"window\": 5,\n",
    "    \"min_count\": 1,\n",
    "    \"workers\": 7,\n",
    "    \"epochs\": 100,\n",
    "    \"alpha\": 0.025,\n",
    "    \"min_alpha\": 0.00025,\n",
    "}\n",
    "Abstract_doc2vec = Doc2Vec(documents, **doc2vec_params)\n",
    "\n",
    "momentoFinal = datetime.datetime.now()     # end of the timed section\n",
    "print(\"Tempo total decorrido: \", momentoFinal - momentoInicial)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the Doc2Vec model to disk (path is relative to the current working directory)\n",
    "fname = \"Abstract_doc2vec_model\"\n",
    "Abstract_doc2vec.save(fname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Doc2Vec model back from disk (same relative path used when saving it)\n",
    "fname = \"Abstract_doc2vec_model\"\n",
    "Abstract_doc2vec = Doc2Vec.load(fname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tempo total decorrido:  0:06:33.761800\n"
     ]
    }
   ],
   "source": [
    "# Use the Doc2Vec model to infer one vector per article, for every row of `abstract`.\n",
    "\n",
    "momentoInicial = datetime.datetime.now()   # start of the timed section\n",
    "\n",
    "abstract[\"Doc2Vec\"] = abstract[\"Abstract\"].map(Abstract_doc2vec.infer_vector)\n",
    "\n",
    "momentoFinal = datetime.datetime.now()     # end of the timed section\n",
    "print(\"Tempo total decorrido: \", momentoFinal - momentoInicial)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tempo total decorrido:  0:06:41.745800\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# Use the Doc2Vec model to infer one vector per article for abstract_train and abstract_test.\n",
    "\n",
    "momentoInicial = datetime.datetime.now()   # start of the timed section\n",
    "\n",
    "# DataFrame.assign returns a new frame that carries the extra column, so nothing is written\n",
    "# into a slice of `abstract` and pandas raises no SettingWithCopyWarning.\n",
    "# (For the augmented set, apply the same call to augmented_abstract_train.)\n",
    "abstract_train = abstract_train.assign(\n",
    "    Doc2Vec=abstract_train[\"Abstract\"].apply(Abstract_doc2vec.infer_vector)\n",
    ")\n",
    "abstract_test = abstract_test.assign(\n",
    "    Doc2Vec=abstract_test[\"Abstract\"].apply(Abstract_doc2vec.infer_vector)\n",
    ")\n",
    "\n",
    "momentoFinal = datetime.datetime.now()     # end of the timed section\n",
    "print(\"Tempo total decorrido: \", momentoFinal - momentoInicial)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Modelo de classificação"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split each subset into a feature matrix (one inferred Doc2Vec vector per row) and a label\n",
    "# vector taken from the \"O&G\" column. (Swap abstract_train for augmented_abstract_train to\n",
    "# use the augmented training set.)\n",
    "features_train, features_test = (\n",
    "    np.array(frame[\"Doc2Vec\"].tolist()) for frame in (abstract_train, abstract_test)\n",
    ")\n",
    "label_train, label_test = (\n",
    "    np.array(frame[\"O&G\"]) for frame in (abstract_train, abstract_test)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Treinando uma Árvore de Decisão para identificar as features mais importantes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tempo total decorrido:  0:00:01.995000\n"
     ]
    }
   ],
   "source": [
    "# Fit a decision tree on all 50 dimensions; its feature importances are used\n",
    "# afterwards to rank the attributes.\n",
    "\n",
    "momentoInicial = datetime.datetime.now()   # start of the timed section\n",
    "\n",
    "# fit() returns the fitted estimator, so creation and training can be chained\n",
    "clf = tree.DecisionTreeClassifier().fit(features_train, label_train)\n",
    "\n",
    "momentoFinal = datetime.datetime.now()     # end of the timed section\n",
    "print(\"Tempo total decorrido: \", momentoFinal - momentoInicial)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Usando para o treinamento apenas os \"N\" atributos mais importantes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of attributes kept for training\n",
    "N = 50\n",
    "# argsort is ascending, so the last N positions are the N attributes with the highest\n",
    "# importance according to the fitted decision tree\n",
    "important = np.argsort(clf.feature_importances_)[-N:].tolist()\n",
    "\n",
    "# Keep only those columns, ordered from least to most important\n",
    "features_train = pd.DataFrame(features_train).loc[:, important]\n",
    "features_test = pd.DataFrame(features_test).loc[:, important]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tempo total decorrido:  0:14:47.819000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  % self.max_iter, ConvergenceWarning)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>F1-Score</th>\n",
       "      <td>0.864836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Acurácia</th>\n",
       "      <td>0.865107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Precisão</th>\n",
       "      <td>0.864874</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Revocação</th>\n",
       "      <td>0.865107</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  0\n",
       "F1-Score   0.864836\n",
       "Acurácia   0.865107\n",
       "Precisão   0.864874\n",
       "Revocação  0.865107"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train the classifier on the selected Doc2Vec dimensions and score it on the held-out test set.\n",
    "\n",
    "momentoInicial = datetime.datetime.now()   # start of the timed section\n",
    "\n",
    "# Classifier under evaluation. Alternatives kept for reference:\n",
    "#   svm.SVC(), RandomForestClassifier(n_estimators=100), tree.DecisionTreeClassifier(),\n",
    "#   NearestCentroid(), KNeighborsClassifier(), GaussianNB(), MLPClassifier(solver='sgd')\n",
    "# hidden_layer_sizes is written as a tuple: one hidden layer with 5000 units.\n",
    "# random_state is fixed so that reruns of this cell are repeatable.\n",
    "clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(5000,), random_state=42)\n",
    "\n",
    "clf.fit(features_train, label_train)        # train the classifier\n",
    "# Predict the test set with the trained classifier\n",
    "label_pred = clf.predict(features_test)\n",
    "\n",
    "# Precision, recall and F1 as weighted averages over the classes; `support` is returned by\n",
    "# the call but not used below.\n",
    "precision, recall, f1, support = precision_recall_fscore_support(label_test,\n",
    "                                                                 label_pred,\n",
    "                                                                 average='weighted')\n",
    "acc = accuracy_score(label_test, label_pred)\n",
    "\n",
    "# Collect the results in a one-column DataFrame (one row per metric)\n",
    "scores = {'F1-Score':f1,\n",
    "          'Acurácia':acc,\n",
    "          'Precisão':precision,\n",
    "          'Revocação':recall}\n",
    "scores = pd.DataFrame.from_dict(scores, orient='index')\n",
    "\n",
    "momentoFinal = datetime.datetime.now()     # end of the timed section\n",
    "print(\"Tempo total decorrido: \", momentoFinal - momentoInicial)\n",
    "\n",
    "scores"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
